{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "BOwsuGQQY9OL",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "import numpy as np \n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 238
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 2494,
     "status": "ok",
     "timestamp": 1558917564071,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "pylt5qZYsWPh",
    "outputId": "55e4ba98-9c52-4bcd-d369-bd76336e5fce",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(time.time())\n",
    "# !wget --no-check-certificate \\\n",
    "#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \\\n",
    "#     -O irish-lyrics-eof.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 71
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 522,
     "status": "ok",
     "timestamp": 1558917567068,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "PRnDnCW-Z7qv",
    "outputId": "0bdd6f6d-44ac-49e9-9e52-929e36b78b3e",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer()\n",
    "\n",
    "data = open('../../tensorflow_datasets/irish-lyrics-eof.txt').read()\n",
    "\n",
    "corpus = data.lower().split(\"\\n\")\n",
    "\n",
    "tokenizer.fit_on_texts(corpus)\n",
    "total_words = len(tokenizer.word_index) + 1\n",
    "\n",
    "print(tokenizer.word_index)\n",
    "print(total_words)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "soPGVheskaQP",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "input_sequences = []\n",
    "for line in corpus:\n",
    "\ttoken_list = tokenizer.texts_to_sequences([line])[0]\n",
    "\tfor i in range(1, len(token_list)):\n",
    "\t\tn_gram_sequence = token_list[:i+1]\n",
    "\t\tinput_sequences.append(n_gram_sequence)\n",
    "\n",
    "# pad sequences \n",
    "max_sequence_len = max([len(x) for x in input_sequences])\n",
    "input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))\n",
    "\n",
    "# create predictors and label\n",
    "xs, labels = input_sequences[:,:-1],input_sequences[:,-1]\n",
    "\n",
    "ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 153
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 461,
     "status": "ok",
     "timestamp": 1558917571409,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "pJtwVB2NbOAP",
    "outputId": "b02c5e5a-79c7-421c-b627-2464ddd8771b",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(tokenizer.word_index['in'])\n",
    "print(tokenizer.word_index['the'])\n",
    "print(tokenizer.word_index['town'])\n",
    "print(tokenizer.word_index['of'])\n",
    "print(tokenizer.word_index['athy'])\n",
    "print(tokenizer.word_index['one'])\n",
    "print(tokenizer.word_index['jeremy'])\n",
    "print(tokenizer.word_index['lanigan'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 469,
     "status": "ok",
     "timestamp": 1558917573259,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "49Cv68JOakwv",
    "outputId": "0a5e4791-c270-45f9-b886-79a4ce85a18b",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(xs[6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 469,
     "status": "ok",
     "timestamp": 1558917574741,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "iY-jwvfgbEF8",
    "outputId": "6e0fed89-6e1a-4e49-c0a4-1aa8f39c3432",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(ys[6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 51
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 481,
     "status": "ok",
     "timestamp": 1558917576380,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "wtzlUMYadhKt",
    "outputId": "8d05f736-c8de-4a25-c4f9-e8f842690f62",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(xs[5])\n",
    "print(ys[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 454,
     "status": "ok",
     "timestamp": 1558917579429,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "H4myRpB1c4Gg",
    "outputId": "48f6a5d6-cfe0-4fcb-dba3-c04c6c20d8c6",
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "print(tokenizer.word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 3497
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 815477,
     "status": "ok",
     "timestamp": 1558918396278,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "w9vH8Y59ajYL",
    "outputId": "c8cd3256-d2f7-4b9e-b10a-b2b0538bb246",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "model = Sequential()\n",
    "model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))\n",
    "model.add(Bidirectional(LSTM(150)))\n",
    "model.add(Dense(total_words, activation='softmax'))\n",
    "adam = Adam(lr=0.01)\n",
    "model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])\n",
    "#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')\n",
    "history = model.fit(xs, ys, epochs=100, verbose=1)\n",
    "#print model.summary()\n",
    "print(model)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "3YXGelKThoTT",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "def plot_graphs(history, string):\n",
    "  plt.plot(history.history[string])\n",
    "  plt.xlabel(\"Epochs\")\n",
    "  plt.ylabel(string)\n",
    "  plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 297
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 576,
     "status": "ok",
     "timestamp": 1558918412141,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "poeprYK8h-c7",
    "outputId": "ffaf6f0e-fcbc-450d-d872-884c24f8bb08",
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "plot_graphs(history, 'accuracy')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "executionInfo": {
     "elapsed": 1590,
     "status": "ok",
     "timestamp": 1558930962952,
     "user": {
      "displayName": "Laurence Moroney",
      "photoUrl": "https://lh3.googleusercontent.com/-RcxktLY-TBk/AAAAAAAAAAI/AAAAAAAAABY/b4V4dTIqmPI/s64/photo.jpg",
      "userId": "06401446828348966425"
     },
     "user_tz": 420
    },
    "id": "6Vc6PHgxa6Hm",
    "outputId": "5912b513-dac1-4129-c2e6-0bdf4caeef8d",
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "seed_text = \"I've got a bad feeling about this\"\n",
    "next_words = 100\n",
    "  \n",
    "for _ in range(next_words):\n",
    "\ttoken_list = tokenizer.texts_to_sequences([seed_text])[0]\n",
    "\ttoken_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')\n",
    "\tpredicted = model.predict_classes(token_list, verbose=0)\n",
    "\toutput_word = \"\"\n",
    "\tfor word, index in tokenizer.word_index.items():\n",
    "\t\tif index == predicted:\n",
    "\t\t\toutput_word = word\n",
    "\t\t\tbreak\n",
    "\tseed_text += \" \" + output_word\n",
    "print(seed_text)\n",
    "print(time.time())\n"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "name": "Course 3 - Week 1 - Lesson 2 - Notebook.ipynb",
   "provenance": [
    {
     "file_id": "1Zn6Lgn4cEJiA_uHym2oOWtQcsjwwGFG1",
     "timestamp": 1558732299987
    },
    {
     "file_id": "1V60jn23JMcMpwhF-KvCTYIIfm4J2X6v5",
     "timestamp": 1558707866173
    }
   ],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
